Data Collection
This section explains some of our data collection techniques. As mentioned in the main article, we took advantage of a Git project called Tweepy that allows us to easily interact with the Streaming API. To utilize Tweepy, we used Python scripts from badhessian.org. These scripts required only a little bit of modification for our purposes. There is slistener.py, which creates a listener that will stream the Tweepy interaction, while the streaming.py file saves the tweets to a file.
Code to create a slistener object that will stream
from tweepy import StreamListener
import json, time, sys
from time import sleep
class SListener(StreamListener):
    """Stream listener that writes raw tweet JSON to timestamped files.

    Each status message is appended as one line to the current output file.
    After MAX_TWEETS_PER_FILE statuses the file is closed and a new
    timestamped file is started in the same directory. The ids of deleted
    statuses are appended to ``delete.txt``.
    """

    # Directory for every output file. Rotated files previously went to a
    # different directory than the first file; both now use this one.
    OUTPUT_DIR = '../Data/'
    # Number of statuses written to one file before rotating.
    MAX_TWEETS_PER_FILE = 20000

    def __init__(self, api = None, fprefix = 'streamer'):
        """Open the first output file and the deletion log.

        api     -- tweepy API object; a default one is created when omitted.
        fprefix -- prefix of every output file name.
        """
        if not api:
            # The file header only imports StreamListener, so bring API
            # into scope here instead of relying on an undefined name.
            from tweepy import API
            api = API()
        self.api = api
        self.counter = 0
        self.fprefix = fprefix
        self.output = self._open_output()
        self.delout = open('delete.txt', 'a')

    def _open_output(self):
        """Return a new, timestamped output file opened for writing."""
        name = self.fprefix + '.' + time.strftime('%Y%m%d-%H%M%S') + '.json'
        return open(self.OUTPUT_DIR + name, 'w')

    def on_data(self, data):
        """Dispatch one raw stream message (a JSON string) by its type.

        Returns False to ask the stream to stop; any other return value
        keeps it running. The type is detected with substring checks on
        the raw text, in the order below.
        """
        if 'in_reply_to_status' in data:
            self.on_status(data)
        elif 'delete' in data:
            delete = json.loads(data)['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
        elif 'warning' in data:
            # The message key is 'warning' (singular). Fall back to the raw
            # text if the substring matched some other kind of message.
            warning = json.loads(data).get('warning', {})
            print(warning.get('message', data))
            return False

    def on_status(self, status):
        """Append one raw status to the current file, rotating when full."""
        self.output.write(status + "\n")
        self.counter += 1
        print(self.counter)
        # Short manual pause to throttle processing of the incoming stream.
        sleep(0.2)
        if self.counter >= self.MAX_TWEETS_PER_FILE:
            self.output.close()
            self.output = self._open_output()
            self.counter = 0
        return

    def on_delete(self, status_id, user_id):
        """Record the id of a deleted status in the deletion log."""
        self.delout.write(str(status_id) + "\n")
        return

    def on_limit(self, track):
        """Report a limit notice on stderr.

        ``track`` arrives as a number, so it is converted before being
        concatenated with the newline.
        """
        sys.stderr.write(str(track) + "\n")
        return

    def on_error(self, status_code):
        """Report an error status code on stderr and stop the stream."""
        sys.stderr.write('Error: ' + str(status_code) + "\n")
        return False

    def on_timeout(self):
        """Wait 60 seconds after a timeout, then let the stream continue."""
        sys.stderr.write("Timeout, sleeping for 60 seconds...\n")
        time.sleep(60)
        return
Code that will save data coming from slistener to a file
from slistener import SListener
import time, tweepy, sys
from time import sleep
## authentication
## OAuth credentials are read from the environment so that no secret is
## stored in this file. Set the four variables below before running the
## script; a missing one raises KeyError at start-up.
## NOTE(review): credentials that were previously committed in this file
## should be treated as leaked and revoked.
import os

auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'],
                           os.environ['TWITTER_CONSUMER_SECRET'])
auth.set_access_token(os.environ['TWITTER_ACCESS_TOKEN'],
                      os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
api = tweepy.API(auth)
def main():
    """Stream tweets matching the tracked phrases, reconnecting on errors.

    A fresh listener and stream are created for every attempt. When the
    stream ends normally the function returns. On an error the stream is
    disconnected, the error is printed, and a new attempt starts after
    five seconds. Ctrl-C stops the script instead of being retried.
    """
    track = ['halftime show', 'halftime']
    while True:
        listen = SListener(api, 'halftime_show')
        stream = tweepy.Stream(auth, listen)
        print("Streaming started...")
        try:
            stream.filter(track = track)
            break
        except KeyboardInterrupt:
            # Let the user stop the collection; a bare except would have
            # swallowed this and restarted the stream forever.
            stream.disconnect()
            raise
        except Exception as exc:
            print("error! %s" % exc)
            stream.disconnect()
            sleep(5)
        finally:
            # Each attempt opens its own files; close them so that retries
            # do not leak file handles.
            listen.output.close()
            listen.delout.close()


if __name__ == '__main__':
    main()
We ran our scripts for the duration of the Super Bowl, which was about four hours. During this time we collected over a gigabyte of tweets in the standard Twitter API JSON format. The Streaming API is rate limited, so we did not obtain literally every tweet that contained one of our keywords, especially because at times our scripts were manually limited by a short pause. This was because the stream of tweets at times exceeded what our computers could process and simply crashed the Python scripts.
- Guide to streaming with Tweepy
- Tweepy Git page